############################################################################
#Functions created to clean, merge, combine, and analyze data #filed edited in 01/20/22 to remove comments

# demean: demeaning function
# inv_hyp: logging function
# clean.accents: cleaning names
# calculate_mean_effects_index (EGAP)  
# imp.mice  
# scalev  
# validate_cpfs
# summary_stats functions
# inpute mean values
# est_cace: IV lm_robust (with and without controls)
# est_lin:  ITT lin estimation (with and without controls)
# est_lm: ITT lm_robust

############################################################################

# Mean Effects Index (From EGAP's methods tools)
calculate_mean_effects_index <- function(Z, outcome_mat, to_reorient
                                         , reorient = FALSE, greedy = TRUE){
  if(length(Z) != nrow(outcome_mat)) 
    stop("Error: Treatment assignment, outcome matrix require same n!")
  c_mean <- apply(X = outcome_mat[Z==0,], MARGIN = 2, FUN = mean, na.rm = T)
  c_sd <- apply(X = outcome_mat[Z==0,], MARGIN = 2, FUN = sd, na.rm = T)
  z_score <- t(t(sweep(outcome_mat, 2, c_mean))/ c_sd)
  index_numerator <- rowSums(z_score)
  if(greedy == TRUE){
    n_outcomes <- rowSums(!is.na(z_score))
  }
  else if(greedy == FALSE){
    n_outcomes <- ncol(outcome_mat)
  }
  index <- index_numerator/n_outcomes
  index <-  (index - mean(index[Z==0], na.rm =T))/sd(index[Z==0], na.rm =T)
  return(index)
}

#Muiltiple imputation of missing items in indices
#Follows Metaketa II's code, but implemented as funtion for MCMV project 
imp.mice <- function(x) {
  to.drop <- which(apply(is.na(x),1,sum)==ncol(x)) #all items are missing 
  cat("Dropping",length(to.drop),"observations for which all items in index are missing\n")
  predictor.matrix <- matrix(data = 1,
                             ncol = ncol(x),
                             nrow = ncol(x))
  diag(predictor.matrix) <- 0
  imp <- mice(data = x,
              m = 1,
              method = "norm.predict",
              predictorMatrix = predictor.matrix
  )
  out <- complete(imp)
  out[to.drop,1:ncol(x)] <- NA #remove those for whom we don't have any observation
  return(out)
}

imp.meanmode <- function(x){
  prop.na <- sum(is.na(x))/length(x)
  if(prop.na==0) {(cat("No missing values\n")); return(x)}
  if(prop.na>0.1) {cat("Need to create additional indicator\n"); return(x)}
  if(prop.na<0.1){ 
    if(length(unique(x))<6){ #categorical variable, use mode
      x[which(is.na(x))] <- as.numeric(names(sort(table(x),decreasing=T)))[1]
    }else{  #continuous variable use mean
      x[which(is.na(x))] <- mean(x,na.rm=T)
    }
    return(x)}}

#Summary stats function
summary_stats <- function(data=data){
  n_obs <- sapply(data, function(x) sum(!is.na(x))) 
  min <- as.matrix(apply(data, 2, FUN=min, na.rm=T))
  mean <- as.matrix(apply(data, 2, FUN=mean, na.rm=T))
  median <- as.matrix(apply(data, 2, FUN=median, na.rm=T))
  max <- as.matrix(apply(data, 2, FUN=max, na.rm=T))
  sd <- as.matrix(apply(data, 2, FUN=sd, na.rm=T))
  missing <- sapply(data, function(x) sum(is.na(x)))  
  summary <- as.data.frame(cbind(n_obs, min, mean, median, sd, max, missing))
  names(summary) <- c("Number of Obs.", "Min.", "Mean", "Median", "Std. Dev.", "Max", "Missing")
  return(summary)
}


#Estimation functions

est_lm <- function(index, treat = "Z", outcomes=outcomes, ctrls=NULL, fe= NULL
                   , clusters=NULL, the.data=survey, w=NULL){
  results_t <- as.data.frame(matrix(NA, 1, 9))
  the.data$Z <- as.numeric(the.data[,treat])
  the.data$clusters <- if(is.null(clusters)){NULL}else{as.numeric(factor(the.data[,clusters]))}
  the.data$block <- if(is.null(fe)){0}else{factor(the.data[,fe])}
  if (length(ctrls)==0){
    the.formula <- formula(paste(outcomes[index]," ~ Z"))
    reg <- lm_robust(formula=the.formula 
                     , data=the.data
                     , fixed_effects = ~block
                     , se_type = "stata"
                     , clusters = the.data$clusters
                     , weights = w) 
    results_t[1:7] <- round(summary(reg)$coef["Z",],5)
    results_t[8] <- reg$nobs #due to package update
    ## Standardized Effect
    tmp <- data.frame(y = the.data[,outcomes[index]], g = the.data$Z)
    pool_sd <- pooled.sd(na.omit(tmp))
    results_t[1,9] <- round(results_t[1,1]/pool_sd,5) 
  }else{#with controls....
    the.formula.covariates <- formula(paste(outcomes[index], paste("~ Z + ",paste(ctrls,collapse="+"))))
    reg <- lm_robust(formula=the.formula.covariates 
                     , data=the.data
                     , fixed_effects = ~block
                     , clusters = the.data$clusters
                     , se_type = "stata"
                     , weights = w) 
    results_t[1:7] <- round(summary(reg)$coef["Z",],5)
    results_t[8] <- reg$nobs #due to package update
    ## Standardized Effect
    tmp <- data.frame(y = the.data[,outcomes[index]], g = the.data$Z)
    pool_sd <- pooled.sd(na.omit(tmp))
    results_t[1,9] <- round(results_t[1,1]/pool_sd,5)
  }
  names(results_t) <- c(colnames(summary(reg)$coef),"N","Std. Estimate")
  return(results_t)
}



est_lin <- function(index, treat = "Z", outcomes=outcomes, ctrls=NULL
                    , clusters=NULL, the.data=survey, w=NULL){
  results_t <- as.data.frame(matrix(NA, 1, 9))
  the.data$Z <- as.numeric(the.data[,treat])
  the.formula <- formula(paste(outcomes[index]," ~ Z"))
  the.data$clusters <- if(is.null(clusters)){NULL}else{as.numeric(factor(the.data[,clusters]))}
  the.covariates <- formula(paste("~",paste(ctrls,collapse="+")))
    reg <- lm_lin(formula=the.formula, covariates=the.covariates
                  , data=the.data
                  , clusters = the.data$clusters
                  , se_type = "stata"
                  , weights = w) 
    results_t[1:7] <- round(summary(reg)$coef["Z",],5)
    results_t[8] <- reg$nobs #updated due to package update
    ## Standardized Effect
    tmp <- data.frame(y = the.data[,outcomes[index]], g = the.data$Z)
    pool_sd <- pooled.sd(na.omit(tmp))
    results_t[1,9] <- round(results_t[1,1]/pool_sd,5)
  #}
  names(results_t) <- c(colnames(summary(reg)$coef),"N","Std. Estimate")
  return(results_t)
}

#CACE with or without controls
est_cace <- function(index, treat = "Z",  compliance="D", fe=NULL
                              , ctrls=NULL
                              , outcomes=outcomes, clusters=NULL
                              , the.data=survey, w=NULL){
  require("ANOVAreplication")
  results_t <- as.data.frame(matrix(NA, 1, 9))
  the.data$D <- as.numeric(as.matrix(the.data[,compliance]))
  the.data$Z <- as.numeric(as.matrix(the.data[,treat]))
  the.data$clusters <- if(is.null(clusters)){NULL}else{as.numeric(the.data[,clusters])}
  if(is.null(ctrls)){#no controls
    the.formula.D <- paste("D")
    the.formula.Z <- paste("Z")
  }else{#with controls
    the.data[,ctrls] <- the.data[,ctrls]
    the.formula.D <- paste("D + ",paste(ctrls,collapse="+"))
    the.formula.Z <- paste("Z + ",paste(ctrls,collapse="+"))
  }
  the.formula <- formula(paste(outcomes[index],"~",the.formula.D,"|",the.formula.Z))
  if (length(fe)==0){#without fixed effects...
  # Estimation
  reg <- iv_robust(the.formula        
                   , data = the.data
                   , se_type = "stata"
                   , clusters = the.data$clusters
                   , weights = w
  )
  results_t[1:7] <- round(summary(reg)$coefficients["D",],5)
  results_t[1,8] <- reg$nobs #due to package update
  # Standardizes results 
  tmp <- data.frame(y = the.data[,outcomes[index]], g = the.data$Z)
  pool_sd <- pooled.sd(na.omit(tmp))
  results_t[1,9] <- round(results_t[1,1]/pool_sd,5)
  names(results_t) <- c(colnames(summary(reg)$coef),"N","Std. Estimate")
  }
  else{#with fixed effects
  the.data$block <- factor(the.data[,fe])
  # Estimation
  reg <- iv_robust(the.formula
                     , data = the.data
                     , se_type = "stata"
                     , fixed_effects = ~block
                     , clusters = if(is.null(the.data$clusters)){NULL}else{the.data$clusters}
                     , weights = w
    )
    results_t[1:7] <- round(summary(reg)$coefficients["D",],5)
    results_t[1,8] <- reg$nobs #due to package update
    # Standardizes results 
    tmp <- data.frame(y = the.data[,outcomes[index]], g = the.data$Z)
    pool_sd <- pooled.sd(na.omit(tmp))
    results_t[1,9] <- round(results_t[1,1]/pool_sd,5)
    names(results_t) <- c(colnames(summary(reg)$coef),"N","Std. Estimate")
  }
  return(results_t)
}


#Scaling
scalev <- function(outcomes,the.data){
  outcome_mat <- the.data[,outcomes]
  Z <- the.data$Z
  c_mean <- apply(X = outcome_mat[which(Z==0),], MARGIN = 2, FUN = mean, na.rm = T)
  c_sd <- apply(X = outcome_mat[which(Z==0),], MARGIN = 2, FUN = sd, na.rm = T)
  out <-     sweep(sweep(outcome_mat,2,c_mean), 2, c_sd, "/")
  return(out)
}

#Comparing estimates
difftest <- function(est1, est2, s1, s2, alpha = 0.05) {
  dom <- est1-est2
  se <- sqrt((s1^2) + (s2^2))
  ci <- se*qnorm(1-(alpha/2))
  pval <- 2*pnorm(abs(dom/se), lower.tail = FALSE)
  result <- c(est1, est2, dom, se, dom-ci, dom+ci, pval)
  names(result) <- c("est.1", "est.2", "dom", "se", "l.ci", 
                     "u.ci", "pval")
  return(result)
}

### R&R function
est_ipw <- function(index, treat = "Z", outcomes=outcomes, ctrls=NULL, 
                    clusters=NULL, the.data=survey, w=NULL){
  results_t <- as.data.frame(matrix(NA, 1, 9))
  the.data$Z <- as.numeric(the.data[,treat])
  the.data$clusters <- if(is.null(clusters)){NULL}else{as.numeric(factor(the.data[,clusters]))}
  if (length(ctrls)==0){
    the.formula <- formula(paste(outcomes[index]," ~ Z"))
    reg <- lm_robust(formula=the.formula 
                     , data=the.data
                     , se_type = "stata"
                     , clusters = the.data$clusters
                     , weights = w) 
    results_t[1:7] <- round(summary(reg)$coef["Z",],5)
    results_t[8] <- reg$nobs #due to package update
    ## Standardized Effect
    tmp <- data.frame(y = the.data[,outcomes[index]], g = the.data$Z)
    pool_sd <- pooled.sd(na.omit(tmp))
    results_t[1,9] <- round(results_t[1,1]/pool_sd,5) 
  }else{#with controls....
    the.formula.covariates <- formula(paste(outcomes[index], paste("~ Z + ",paste(ctrls,collapse="+"))))
    reg <- lm_robust(formula=the.formula.covariates 
                     , data=the.data
                     , clusters = the.data$clusters
                     , se_type = "stata"
                     , weights = w) 
    results_t[1:7] <- round(summary(reg)$coef["Z",],5)
    results_t[8] <- reg$nobs #due to package update
    ## Standardized Effect
    tmp <- data.frame(y = the.data[,outcomes[index]], g = the.data$Z)
    pool_sd <- pooled.sd(na.omit(tmp))
    results_t[1,9] <- round(results_t[1,1]/pool_sd,5)
  }
  names(results_t) <- c(colnames(summary(reg)$coef),"N","Std. Estimate")
  return(results_t)
}


est_lm_lot <- function(index, treat = "Z", outcomes=outcomes, ctrls=NULL, 
                       the.data=survey, w=NULL){
  results_t <- as.data.frame(matrix(NA, 1, 9))
  the.data$Z <- as.numeric(the.data[,treat])
  if (length(ctrls)==0){
    the.formula <- formula(paste(outcomes[index]," ~ Z"))
    reg <- lm_robust(formula=the.formula 
                     , data=the.data
                     , se_type = "stata"
                     , weights = w) 
    results_t[1:7] <- round(summary(reg)$coef["Z",],5)
    results_t[8] <- reg$nobs #due to package update
    ## Standardized Effect
    tmp <- data.frame(y = the.data[,outcomes[index]], g = the.data$Z)
    pool_sd <- pooled.sd(na.omit(tmp))
    results_t[1,9] <- round(results_t[1,1]/pool_sd,5) 
  }else{#with controls....
    the.formula.covariates <- formula(paste(outcomes[index], paste("~ Z + ",paste(ctrls,collapse="+"))))
    reg <- lm_robust(formula=the.formula.covariates 
                     , data=the.data
                     , se_type = "stata"
                     , weights = w) 
    results_t[1:7] <- round(summary(reg)$coef["Z",],5)
    results_t[8] <- reg$nobs #due to package update
    ## Standardized Effect
    tmp <- data.frame(y = the.data[,outcomes[index]], g = the.data$Z)
    pool_sd <- pooled.sd(na.omit(tmp))
    results_t[1,9] <- round(results_t[1,1]/pool_sd,5)
  }
  names(results_t) <- c(colnames(summary(reg)$coef),"N","Std. Estimate")
  return(results_t)
}

# Declare a function to predict outcomes in both waves and store the relevant test
pred_out <- function(index, treat = "wave2", outcomes=outcomes, ctrls=pre.treat, fe= "id_edital"
                     , clusters="fieldID", the.data=pooled.ctrls, w.var="sampling_prob"){
  require(car)
  results_t <- as.data.frame(matrix(NA, 1, 8))
  the.data$Z <- as.numeric(the.data[,which(names(the.data)==treat)])
  the.data$clusters <- if(is.null(clusters)){NULL}else{as.numeric(factor(the.data[,clusters]))}
  the.data$block <- if(is.null(fe)){0}else{factor(the.data[,fe])}
  the.data$w <- 1/the.data[,w.var]
  if(is.null(fe)){
    the.formula <- formula(paste(outcomes[index],"~ wave2 * (",paste(ctrls,collapse="+"),")"))
  }else{
    the.formula <-  formula(paste(outcomes[index],"~",
                                  paste(ctrls,collapse="+"), 
                                  paste("+wave2:(",paste(ctrls,collapse="+"),")")))
  }
  reg <- lm_robust(formula=the.formula 
                   , data=the.data
                   , fixed_effects = ~block
                   , se_type = "stata"
                   , clusters = the.data$clusters
                   , weights = w) 
  interact.coefs <- grep(":",names(coef(reg)))
  jointf <- linearHypothesis(reg,paste(names(coef(reg)[interact.coefs]),"=0"),test="F")
  results_t[1:3]=c(F=jointf$F[2],p=jointf$Pr[2],df=jointf$Df[2])
  results_t[4] <- reg$nobs
  results_t[5] <- sum(summary(reg)$coef[interact.coefs,"Pr(>|t|)"]< 0.1)
  results_t[6] <- sum(summary(reg)$coef[interact.coefs,"Pr(>|t|)"]< 0.05)
  results_t[7] <- sum(summary(reg)$coef[interact.coefs,1]> 0)
  results_t[8] <- sum(summary(reg)$coef[interact.coefs,1]<= 0)
  
  
  names(results_t) <- c("F","P-val","df","N","I*","I**","Pos","Neg")
  return(results_t)
}

nsnr <- function(x){  
  if(class(x)=="numeric"|class(x)=="integer"){
    ifelse(is.element(x,c(88,98,99)),NA,x)
  }else{
    x <- gsub("^\\s?-|–\\s","",x)
    ifelse(is.element(tolower(as.character(x)),
                      c("ns/nr","ns","nr","nsa","não sabe/ não respondeu","não respondeu","não sabe","não respondeu/ recusa", "não sabe (esp.)", "não respondeu (esp.)")),NA,as.character(x))}
}

# Evaluation by income level(plot all)
all.lm <- function(i){
  the.formula <- formula(paste(eval.vars[i],"~fam.income.imp-1"))
  out <- lm(the.formula,data=d)
}

my.baseplot<-function(x,ymin=0.5,ymax=1,my.axis=T){#background empty plot
  plot(c(.5,x+.5),c(ymin,ymax),type="n",xlab="",ylab="",xaxt="n",yaxt="n",bty="n")
  for(i in 1:x){polygon(x=c(i-.5,i+.5,i+.5,i-.5),
                        y=c(-1,-1,11,11),border=NA,
                        col=ifelse(i/2==round(i/2),gray(0.85),"white"))}
  if(my.axis){axis(side=2,at=seq(ymin,ymax,by=1),labels=seq(ymin,ymax,by=1))}
}

the.plot  <- function(x,ymin=3,ymax=9,my.axis=T){#creates the plot with same scale, calls other ploting fcts
  my.baseplot(length(x),ymin=ymin,ymax=ymax,my.axis=my.axis)
  xx<- lapply(x,my.regs)
  for(i in 1:length(x)){my.plot(xx[[i]],i)}
}

my.regs <- function(x,xs,a=.95){
  require(plotrix)
  baseline <- mean(x$model[,1])
  points <- summary(x)$coef[,1]
  ciwd <- summary(x)$coef[,2]*qnorm(a)
  output <- list(baseline=baseline,est=cbind(points,ciwd))
  return(output)
}

my.plot <- function(xx,the.xs=1,sym.col=T){
  #workhouse function for plot using the same yscale for all quesitons
  require(plotrix)
  parties <- nrow(xx$est)
  xs <- the.xs+seq(-.275,.275,len=parties)  
  if(sym.col==T){
    plotCI(xs,xx$est[,1],uiw=xx$est[,2]
           ,col=c(1,gray(0.25),gray(0.25),gray(0.25),1),lwd=1.5,sfrac=0.005
           ,pch=NA,ylab="",xlab="",yaxt="n",xaxt="n",bty="n",add=T)
    points(xs,xx$est[,1],pch=21:(21+parties)
           ,bg=c(1,rep("white",parties-2),1)
           ,col=c(1,rep(gray(0.45),parties-2),1))}
  if(sym.col==F){
    plotCI(xs,xx$est[,1]+xx$baseline,uiw=xx$est[,2]
           ,col=c(1,gray(0.25),gray(0.25),gray(0.25),gray(0.25)),lwd=1.5,sfrac=0.005
           ,pch=NA,ylab="",xlab="",yaxt="n",xaxt="n",bty="n",add=T)
    points(xs,xx$est[,1]+xx$baseline,pch=21:(21+parties)
           ,bg=c(1,rep("white",parties-1))
           ,col=c(1,rep(gray(0.25),parties-1)))}		
  
  segments(x0=xs[1],x1=xs[parties],y0=xx$baseline,y1=xx$baseline,lty=2,lwd=1.5)
}

## Heterogenous effects for treatment only moderators (considered as two different treatments)
reg_hetfactor <- function(index, treat = "Z", outcomes=outcomes, ctrls=NULL, fe= NULL
                          , clusters=NULL, the.data=survey, w=NULL){
  if(length(levels(factor(the.data[,treat])))!=3){stop("Function designed to work with three level factor as treatment\n")}
  require("ANOVAreplication")
  results_t <- as.data.frame(matrix(NA, 1, 20))
  the.data$Zcat <- factor(the.data[,treat])
  the.data$clusters <- if(is.null(clusters)){NULL}else{as.numeric(factor(the.data[,clusters]))}
  the.data$block <- if(is.null(fe)){0}else{factor(the.data[,fe])}
  if (length(ctrls)==0){
    the.formula <- formula(paste(outcomes[index]," ~ Zcat"))
    reg <- lm_robust(formula=the.formula 
                     , data=the.data
                     , fixed_effects = ~block
                     , se_type = "stata"
                     , clusters = the.data$clusters
                     , weights = w) 
  }else{#with controls....
    the.formula.covariates <- formula(paste(outcomes[index], paste("~ Zcat + ",paste(ctrls,collapse="+"))))
    reg <- lm_robust(formula=the.formula.covariates 
                     , data=the.data
                     , fixed_effects = ~block
                     , clusters = the.data$clusters
                     , se_type = "stata"
                     , weights = w) 
  }
  ## Save coefs for low and high treatments
  low.row <- grep('Z_1',rownames(summary(reg)$coef))
  high.row <- grep('Z_2',rownames(summary(reg)$coef))
  results_t[1:7] <- round(summary(reg)$coef[low.row,],5)
  results_t[9:15] <- round(summary(reg)$coef[high.row,],5)
  ## Standardized Effect (for low and high treatment)
  tmp <- data.frame(y = the.data[,outcomes[index]], g = the.data$Z)
  pool_sd <- pooled.sd(na.omit(tmp))
  results_t[1,8] <- round(results_t[1,1]/pool_sd,5)
  results_t[1,16] <- round(results_t[1,9]/pool_sd,5) 
  results_t[1,17] <- reg$nobs #updated when package changed
  ## Test difference between low and high coefs
  results_t[1,18] <- coef(reg)[1]- coef(reg)[2]
  results_t[1,19] <- sqrt(vcov(reg)[1,1]+vcov(reg)[2,2]-2*vcov(reg)[1,2])
  results_t[1,20] <- round(linearHypothesis(reg, 
                                            paste0(names(coef(reg))[low.row],"=",names(coef(reg))[high.row]), 
                                            singular.ok = TRUE, 
                                            test=c("F"))$`Pr(>F)`[2],5) #this needed because of multicollinearity
  rownames(results_t) <- outcomes[index]
  labs <- names(table(the.data[,treat]))[c(2,3)]
  cols <- names(round(summary(reg)$coef[low.row,],5))
  names(results_t) <- c( 
    paste0(cols,labs[1]),paste0("Std.Est",labs[1]),
    paste0(cols,labs[2]),paste0("Std.Est",labs[2]),
    "n","diff","se_diff","p_diff")
  #remove excess columns...
  to.drop <- grep("^t|^CI|^DF",names(results_t))
  results_t <- results_t[,-to.drop]
  return(results_t)
}



